# -*- coding: utf-8 -*-
import re
import pandas as pd
import jieba.analyse
# ---- Weibo hot-topic analysis ----
# Load the dataset: one row per post, with repost / comment / like counts
# and the topic text (column names are Chinese and must stay as-is).
datas = pd.read_csv('/data/dataset/weibo/5a.csv', encoding='utf-8')
# print(datas.head(5))

# Hotness score: weighted sum — reposts weigh most (x5), then comments (x3),
# then likes (x1).
datas['热点值'] = datas['转发量'] * 5 + datas['评论量'] * 3 + datas['点赞量']

# Top-15 hottest topics, re-indexed 0..14.
# (reset_index(drop=True) replaces the original reset_index().drop('index', axis=1)
# two-step — same result, one call.)
top_15 = datas.sort_values('热点值', ascending=False).head(15).reset_index(drop=True)
print(top_15['话题内容'])

# Stop-word list used by jieba's TF-IDF keyword extraction.
jieba.analyse.set_stop_words('/data/dataset/weibo/chineseStopWords.txt')

# Digits and ASCII/CJK punctuation to strip before keyword extraction.
# Raw double-quoted string fixes the original bug: an unescaped ' inside a
# single-quoted literal ended the string early and turned the rest of the
# pattern into a comment, breaking the file at parse time. Compiled once,
# outside the loop.
_NOISE = re.compile(
    r"[0-9’!#$%&'()*+,\-./:;<=>?@，。～?★、…【】《》？“”‘’！\[\]^_`{|}~\s]+"
)

# Extract and print the top-2 keywords of each hot topic.
for content in top_15['话题内容']:
    cleaned = _NOISE.sub('', content)
    tags = jieba.analyse.extract_tags(cleaned, topK=2)
    # ''.join prints the same concatenation as the original res[0]+res[1]
    # when two tags exist, but no longer raises IndexError when extraction
    # yields fewer than two keywords (e.g. very short text after cleaning).
    print(''.join(tags))